app/models/agents/website_agent.rb
@@ -107,85 +107,97 @@ module Agents
       log "Fetching #{options['url']}"
       request_opts = { :followlocation => true }
       request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?
-      request = Typhoeus::Request.new(options['url'], request_opts)
 
-      request.on_failure do |response|
-        error "Failed: #{response.inspect}"
+      requests = []
+
+      if options['url'].kind_of?(Array)
+        options['url'].each do |url|
+          requests.push(Typhoeus::Request.new(url, request_opts))
+        end
+      else
+        requests.push(Typhoeus::Request.new(options['url'], request_opts))
       end
 
-      request.on_success do |response|
-        body = response.body
-        if (encoding = options['force_encoding']).present?
-          body = body.encode(Encoding::UTF_8, encoding)
+      requests.each do |request|
+        request.on_failure do |response|
+          error "Failed: #{response.inspect}"
         end
-        doc = parse(body)
 
-        if extract_full_json?
-          if store_payload!(previous_payloads(1), doc)
-            log "Storing new result for '#{name}': #{doc.inspect}"
-            create_event :payload => doc
+        request.on_success do |response|
+          body = response.body
+          if (encoding = options['force_encoding']).present?
+            body = body.encode(Encoding::UTF_8, encoding)
           end
-        else
-          output = {}
-          options['extract'].each do |name, extraction_details|
-            if extraction_type == "json"
-              result = Utils.values_at(doc, extraction_details['path'])
-              log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
-            else
-              case
-              when css = extraction_details['css']
-                nodes = doc.css(css)
-              when xpath = extraction_details['xpath']
-                nodes = doc.xpath(xpath)
+          doc = parse(body)
+
+          if extract_full_json?
+            if store_payload!(previous_payloads(1), doc)
+              log "Storing new result for '#{name}': #{doc.inspect}"
+              create_event :payload => doc
+            end
+          else
+            output = {}
+            options['extract'].each do |name, extraction_details|
+              if extraction_type == "json"
+                result = Utils.values_at(doc, extraction_details['path'])
+                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
               else
-                error "'css' or 'xpath' is required for HTML or XML extraction"
-                return
-              end
-              unless Nokogiri::XML::NodeSet === nodes
-                error "The result of HTML/XML extraction was not a NodeSet"
-                return
-              end
-              result = nodes.map { |node|
-                if extraction_details['attr']
-                  node.attr(extraction_details['attr'])
-                elsif extraction_details['text']
-                  node.text()
+                case
+                when css = extraction_details['css']
+                  nodes = doc.css(css)
+                when xpath = extraction_details['xpath']
+                  nodes = doc.xpath(xpath)
                 else
-                  error "'attr' or 'text' is required on HTML or XML extraction patterns"
+                  error "'css' or 'xpath' is required for HTML or XML extraction"
+                  return
+                end
+                unless Nokogiri::XML::NodeSet === nodes
+                  error "The result of HTML/XML extraction was not a NodeSet"
                   return
                 end
-              }
-              log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
+                result = nodes.map { |node|
+                  if extraction_details['attr']
+                    node.attr(extraction_details['attr'])
+                  elsif extraction_details['text']
+                    node.text()
+                  else
+                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
+                    return
+                  end
+                }
+                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
+              end
+              output[name] = result
             end
-            output[name] = result
-          end
 
-          num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
+            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
 
-          if num_unique_lengths.length != 1
-            error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
-            return
-          end
-
-          old_events = previous_payloads num_unique_lengths.first
-          num_unique_lengths.first.times do |index|
-            result = {}
-            options['extract'].keys.each do |name|
-              result[name] = output[name][index]
-              if name.to_s == 'url'
-                result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
-              end
+            if num_unique_lengths.length != 1
+              error "Got an uneven number of matches for #{options['name']}: #{options['extract'].inspect}"
+              return
             end
+
+            old_events = previous_payloads num_unique_lengths.first
+            num_unique_lengths.first.times do |index|
+              result = {}
+              options['extract'].keys.each do |name|
+                result[name] = output[name][index]
+                if name.to_s == 'url'
+                  result[name] = URI.join(options['url'], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
+                end
+              end
 
-            if store_payload!(old_events, result)
-              log "Storing new parsed result for '#{name}': #{result.inspect}"
-              create_event :payload => result
+              if store_payload!(old_events, result)
+                log "Storing new parsed result for '#{name}': #{result.inspect}"
+                create_event :payload => result
+              end
             end
           end
         end
+
+        hydra.queue request
+        hydra.run
       end
-      hydra.queue request
-      hydra.run
     end
 
     private
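
After this change the agent's 'url' option accepts either a single string or an array of strings, and each URL becomes its own Typhoeus request. Note that because `hydra.queue request` and `hydra.run` sit inside `requests.each`, each request is queued and run on its own, so the URLs are fetched one after another. Below is a minimal standalone sketch of the same one-or-many fan-out, assuming only that the typhoeus gem is installed; the fetch_all helper and its output lines are illustrative, not part of Huginn:

require 'typhoeus'

# Hypothetical helper mirroring the patched check method: normalize one URL
# or an array of URLs into Typhoeus requests, queue them all, then run.
def fetch_all(url_or_urls, request_opts = { :followlocation => true })
  urls = url_or_urls.kind_of?(Array) ? url_or_urls : [url_or_urls]
  hydra = Typhoeus::Hydra.new

  urls.each do |url|
    request = Typhoeus::Request.new(url, request_opts)
    request.on_failure { |response| warn "#{url} failed: #{response.code}" }
    request.on_success { |response| puts "#{url}: #{response.body.bytesize} bytes" }
    hydra.queue request
  end

  hydra.run # queueing everything before one run lets the hydra fetch concurrently
end

fetch_all(["http://xkcd.com/1/", "http://xkcd.com/2/"])
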
spec/models/agents/website_agent_spec.rb
@@ -91,6 +91,30 @@ describe Agents::WebsiteAgent do
       @checker.check
       @checker.logs.first.message.should =~ /Got an uneven number of matches/
     end
+
+    it "should accept an array for url" do
+      @site['url'] = ["http://xkcd.com/1/", "http://xkcd.com/2/"]
+      @checker.options = @site
+      lambda { @checker.save! }.should_not raise_error;
+      lambda { @checker.check }.should_not raise_error;
+    end
+
+    it "should parse events from all urls in array" do
+      lambda {
+        @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
+        @site['mode'] = 'all'
+        @checker.options = @site
+        @checker.check
+      }.should change { Event.count }.by(2)
+    end
+
+    it "should follow unique rules when parsing array of urls" do
+      lambda {
+        @site['url'] = ["http://xkcd.com/", "http://xkcd.com/"]
+        @checker.options = @site
+        @checker.check
+      }.should change { Event.count }.by(1)
+    end
   end
 
   describe 'encoding' do
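
The new examples mutate the @site options hash built earlier in the spec file, outside this diff. A plausible shape for it, with the extraction details shown purely as an illustration:

# Illustrative only; the real values live in the spec's setup block.
@site = {
  'name' => "XKCD",
  'expected_update_period_in_days' => 2,
  'type' => "html",
  'url' => "http://xkcd.com/",          # may now also be an array of URLs
  'mode' => 'on_change',
  'extract' => {
    'url'   => { 'css' => "#comic img", 'attr' => "src" },
    'title' => { 'css' => "#comic img", 'attr' => "title" }
  }
}

The third example relies on the 'on_change' de-duplication mode shown above: fetching the same URL twice yields identical payloads, so only one event is stored, while the second example sets 'mode' to 'all' and so stores both.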